#' ---
#' title: "DOL-ILAB SDC - Nepal Round 3_FinalRaw"
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#' theme: united
#' ---

# Start from a clean workspace. `all.names = TRUE` also removes hidden
# (dot-prefixed) objects. The earlier `all=t` passed the transpose function
# `t` rather than the logical TRUE, which ls() rejects as an invalid argument.
rm(list = ls(all.names = TRUE))

#' #Setup and create dictionary

filename <- "Nepal Round 3_FinalRaw" # !!!Update filename
# NOTE(review): presumably defines the helper functions used below and loads
# the raw file named by `filename` into `mydata` -- confirm in functions_1.7.R
source("functions_1.7.R")

#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Location: Small Location (<100,000) Large Location (>100,000)
# Weight: weightVar
# Household ID: hhId
# Open-ends: Review responses for any sensitive information, redact as necessary 

#!!!Save flagged dictionary in .xlsx format and continue processing data with subset of flagged variables

#' #Direct PII: variables to be removed

# !!!Include any Direct PII variables
# Columns listed here are removed from the data set outright.
dropvars <- c(
  "SbjNam", "FrScName", "RvwName",
  "IDR3_3", "IDR3_18", "IDR3_19", "IDR3_23"
)
# Keep only the columns whose name has no match in `dropvars`.
mydata <- mydata[is.na(match(names(mydata), dropvars))]

#' #Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects
#' !!!Replace vector in "variables" field below with relevant variable names

# Encode Direct PII-team
# The interviewer variable is passed to the project helper for encoding.
mydata <- encode_direct_PII_team(variables = "Srvyr")

#' #Small locations: Encode locations  with pop <100,000 using random large numbers
#'  !!!Include relevant variables, but check their population size first to confirm they are <100,000

# Location variables handed to encode_location(); 999999 is passed as the
# missing-value code.
locvars <- c(
  "IDR3_6_19", "IDR3_6_22", "IDR3_6_23",
  "IDR3_6_24", "IDR3_6_26", "IDR3_6_30",
  "IDR3_6_31", "IDR3_6_35", "IDR3_7"
)
mydata <- encode_location(variables = locvars, missing = 999999)

#' #Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" of 10 or less. 

# Age bands for IDR3_20: ten-year break points from 15 to 65 plus an upper
# bound of 100, and the value labels for codes 1-7 (code 7 is labelled "NA").
break_age <- c(seq(15, 65, by = 10), 100)
labels_age <- setNames(
  as.numeric(1:7),
  c("15-24", "25-34", "35-44", "45-54", "55-64", "65 and older", "NA")
)
mydata <- ordinal_recode(
  variable = "IDR3_20",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)

# !!!Include relevant variables in list below
# Variables flagged as indirect PII; the order is kept exactly as flagged.
# Lines are grouped by variable family / roster index for readability only.

indirect_PII <- c(
  "IDR3_20", "D_9",
  "HC2_O1", "HC2_O2", "HC2_O3", "HC2_O4", "HC2_O5", "HC2_O6",
  "HC3", "HC4_1", "HC4_2", "HC4_3", "HC4_4",
  "D_4", "Inc_17",
  "MC_39x3_1b", "MC_39x3_1d", "Stigma1_2",
  "HT_13x3_1x3", "HT_13x3_4x3", "HT_13x3_7x3", "HT_13x3_13x3",
  "LE20_1_r3", "I_3_conjoint2_3_r3", "G1_04",
  "P3", "P3A", "P4", "P4A",
  "P8_O1", "P8_O2", "P8_O3",
  "P12A", "P13A_O1", "P13A_O2",
  "P9B", "P10B", "P12B", "P13B_O1", "P13B_O2",
  # "C" items, roster entries 1-8
  "I_1_P9C", "I_1_P10C", "I_1_P11C", "I_1_P11_A3", "I_1_P12C", "I_1_P13C_O1",
  "I_2_P9C", "I_2_P10C", "I_2_P11_A3", "I_2_P12C",
  "I_3_P9C", "I_3_P10C", "I_3_P11_A3", "I_3_P12C", "I_3_P13C_O1",
  "I_4_P9C", "I_4_P10C", "I_4_P11_A3", "I_4_P12C", "I_4_P13C_O1",
  "I_5_P9C", "I_5_P10C", "I_5_P11_A3", "I_5_P12C", "I_5_P13C_O1",
  "I_6_P9C", "I_6_P10C", "I_6_P11C", "I_6_P11_A3", "I_6_P12C", "I_6_P13C_O1",
  "I_7_P9C", "I_7_P10C", "I_7_P11_A3", "I_7_P12C", "I_7_P13C_O1",
  "I_8_P9C", "I_8_P10C", "I_8_P11C", "I_8_P11_A3", "I_8_P12C", "I_8_P13C_O1",
  # "D" items, roster entries 1-2
  "I_1_P9D", "I_1_P10D", "I_1_P11D", "I_1_P11_A4", "I_1_P12D", "I_1_P13D_O1",
  "I_2_P9D", "I_2_P10D", "I_2_P11D", "I_2_P11_A4", "I_2_P12D", "I_2_P13D_O1",
  # "E" items, roster entries 1-3
  "I_1_P9E", "I_1_P10E", "I_1_P11E", "I_1_P11_A5", "I_1_P12E", "I_1_P13E_O1",
  "I_2_P9E", "I_2_P10E", "I_2_P11E", "I_2_P11_A5", "I_2_P12E", "I_2_P13E_O1",
  "I_3_P9E", "I_3_P10E", "I_3_P11E", "I_3_P11_A5", "I_3_P12E", "I_3_P13E_O1",
  # P18-P20 items
  "P20A", "P18B", "P19B",
  "I_1_P18C", "I_1_P19C", "I_1_P20C", "I_2_P18C",
  "I_1_P18D",
  "I_2_P18D", "I_2_P19D", "I_2_P20D",
  "I_3_P18D", "I_3_P19D", "I_3_P20D",
  "I_4_P18D", "I_4_P19D", "I_4_P20D",
  "I_5_P18D", "I_5_P19D", "I_5_P20D",
  "I_6_P18D", "I_6_P19D", "I_6_P20D",
  "I_7_P18D", "I_7_P19D", "I_7_P20D",
  "I_8_P18D", "I_8_P19D", "I_8_P20D",
  "I_1_P18E", "I_1_P19E", "I_1_P20E",
  "I_2_P18E", "I_2_P19E", "I_2_P20E",
  "I_3_P18E", "I_3_P19E", "I_3_P20E",
  "I_4_P18E", "I_4_P19E", "I_4_P20E",
  "I_5_P18E", "I_5_P19E", "I_5_P20E",
  "I_6_P18E", "I_6_P19E", "I_6_P20E",
  "I_7_P18E", "I_7_P19E", "I_7_P20E",
  "I_8_P18E", "I_8_P19E", "I_8_P20E",
  # "_cl" items
  "Child1",
  "I_1_NEW_2_cl", "I_1_P19_cl", "I_1_D_9_cl", "I_1_D_4_cl",
  "I_2_NEW_2_cl", "I_2_P19_cl", "I_2_D_9_cl", "I_2_D_4_cl",
  "I_3_NEW_2_cl", "I_3_P19_cl", "I_3_D_9_cl", "I_3_D_4_cl",
  "I_4_NEW_2_cl", "I_4_P19_cl", "I_4_D_4_cl",
  "I_5_NEW_2_cl", "I_5_P19_cl", "I_5_D_9_cl", "I_5_D_4_cl",
  "I_6_NEW_2_cl", "I_6_P19_cl", "I_6_D_4_cl",
  "update6_1", "update6_2", "E2_2"
)

# NOTE(review): presumably produces frequency tables for every flagged
# variable so low-frequency categories can be inspected -- confirm against
# capture_tables() in functions_1.7.R
capture_tables(indirect_PII)

# Recode those with very specific values where more than half of the sample have actual data.

# Encode as low frequencies on languages.
mydata <- encode_direct_PII_team(variables = "E2_2")
# Topcode cases with 5 or more adult household members.
mydata <- top_recode("HC3", break_point = 5, missing = c(888, 999999))

# Top code high income to the 99.5 percentile
# The missing-value code 999999 is excluded from the percentile. na.rm = TRUE
# keeps quantile() from stopping with an error when Inc_17 contains NA.
percentile_99.5 <- floor(quantile(
  mydata$Inc_17[mydata$Inc_17 != 999999],
  probs = 0.995,
  na.rm = TRUE
))
# Assign the result back to `mydata`. It was previously stored in `mydata2`,
# which nothing else reads, so the income top-coding never reached the
# public-use files written at the end of the script.
mydata <- top_recode(variable = "Inc_17", break_point = percentile_99.5, missing = 999999)

#' #Matching and crosstabulations: Run automated PII check 
 
# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file
# Selected categorical key variables: education (D_4) and age (IDR3_20)
selectedKeyVars <- c("D_4", "IDR3_20") ##!!! Replace with candidate categorical demo vars

# weight variable
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# household id variable (cluster)
# selectedHouseholdID = c('wpid') ##!!! Replace with household id

# Build the sdcMicro object from the chosen key variables and print it
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial

# Recode of education and age to reduce risk of re-identification

# Break points and value labels shared by D_4 and HC4_1 to HC4_4.
# NOTE(review): 777/888/999 look like special response codes matching the
# "Does not apply" / "Don't Know" labels -- confirm against the questionnaire
break_edu <- c(0, 6, 9, 11, 12, 13, 15, 16, 777, 888, 999)
labels_edu <- setNames(
  as.numeric(1:11),
  c(
    "Primary or less (0-5)",
    "Lower secondary (6-8)",
    "Secondary (9-10)",
    "SLC (11)",
    "CLASS 12/Intermediate level (12)",
    "Bachelor/Postgraduate level",
    "Literate, but never attended school",
    "Illiterate, and never attended school",
    "Does not apply",
    "Don't Know",
    "NA"
  )
)
# Apply the same recode to each education variable in turn
mydata <- ordinal_recode(variable = "D_4", break_points = break_edu, missing = 999999, value_labels = labels_edu)
mydata <- ordinal_recode(variable = "HC4_1", break_points = break_edu, missing = 999999, value_labels = labels_edu)
mydata <- ordinal_recode(variable = "HC4_2", break_points = break_edu, missing = 999999, value_labels = labels_edu)
mydata <- ordinal_recode(variable = "HC4_3", break_points = break_edu, missing = 999999, value_labels = labels_edu)
mydata <- ordinal_recode(variable = "HC4_4", break_points = break_edu, missing = 999999, value_labels = labels_edu)

# Re-run to check 2-anonymity

selectedKeyVars <- c("D_4", "IDR3_20")
sdcInitial <- createSdcObj(dat = mydata, keyVars = selectedKeyVars)
sdcInitial

#' Show values of key variable of records that violate k-anonymity
# TRUE for records whose value in column 2 of the individual-risk matrix is
# below 2
notAnon <- sdcInitial@risk$individual[, 2] < 2 # for 2-anonymity
mydata[notAnon, selectedKeyVars]
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables

extractManipData(sdcFinal)[notAnon, selectedKeyVars] # manipulated variables HH
# Overwrite D_4 with code 9 for the flagged records
mydata[notAnon, "D_4"] <- 9

#' #Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here:
# Each variable is listed once. "I_1_P14D_12_TEXT" used to appear twice (at
# the top and again among the other "D" items); the first occurrence is kept
# so the order of the remaining entries is unchanged.
open_ends <- c(
  "I_1_P14D_12_TEXT",
  "RvwComment", "SrvyrComment",
  "H2_12_TEXT", "HTNx3_2_14_TEXT", "HTV_1_10_TEXTx3", "HTV_3_11_TEXTx3",
  "CPR5i_TEXT", "G1_00_08_TEXT",
  "P13A_10_TEXT", "P14A_12_TEXT",
  "P13B_10_TEXT", "P14B_12_TEXT", "SIMPOC7B_10_TEXT",
  "I_1_P13C_10_TEXT", "I_1_P14C_12_TEXT", "I_1_SIMPOC7C_10_TEXT",
  "I_2_P14C_12_TEXT",
  "I_1_P13D_10_TEXT",
  "I_2_P14D_12_TEXT",
  "I_1_P13E_10_TEXT", "I_1_P14E_12_TEXT", "I_1_SIMPOC7E_10_TEXT",
  "I_2_P14E_12_TEXT", "I_3_P14E_12_TEXT",
  "NEW_3_12_TEXT", "NEW_9",
  "I_1_Q_559_S", "I_1_SIMPOC7_cl_10_TEXT",
  "I_2_NEW_9_cl", "I_2_SIMPOC7_cl_10_TEXT",
  "I_3_NEW_9_cl", "I_3_Q_559_S", "I_3_SIMPOC7_cl_10_TEXT",
  "e3e_TEXT", "E2_11_8_TEXT", "E_14_7_TEXT",
  "L1_other_text", "L2_other_text"
)

# NOTE(review): presumably writes the open-ended responses to "verbatims.csv"
# for manual review -- confirm against report_open() in functions_1.7.R
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number

# Drop the SrvyrComment column; every other column is kept.
mydata <- mydata[is.na(match(names(mydata), "SrvyrComment"))]

#' #GPS data: Displace
# Setup map

countrymap <- filter(map_data("world"), region == "Nepal") #!!! Select correct country
#admin <- raster::getData("GADM", country="NP", level=0) #!!! Select correct country map using standard 2-letter country codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# Country boundary read from a local GADM file instead of being downloaded
admin <- readRDS(file = "gadm36_NPL_0_sp.rds")

# Displace all pairs of GPS variables (Longitude, Latitude). Check summary statistics and maps before and after displacement.

# !!!Include relevant variables, always longitude first, latitude second.
# `gps.vars` is the loop variable, so it is set to each pair in turn and
# still holds the last pair once the loop ends.
for (gps.vars in list(
  c("Longitude", "Latitude"),
  c("GPSinitial_LO", "GPSinitial_LA"),
  c("gps_CEa_LO", "gps_CEa_LA"),
  c("gpsenumimp_LO", "gpsenumimp_LA")
)) {
  mydata <- displace(gps.vars, admin = admin, samp_num = 1, other_num = 100000) # May take a few minutes to process.
}

#' #Save processed data in stata and SPSS format
#' Adds "_PU" (Public Use) to the end of the name 

# Write the processed data as <filename>_PU.dta (Stata) and <filename>_PU.sav
# (SPSS).
haven::write_dta(mydata, sprintf("%s_PU.dta", filename))
haven::write_sav(mydata, sprintf("%s_PU.sav", filename))

# NOTE(review): presumably a final tabulation of update6_1 as a sanity check
# -- confirm against haven_table() in functions_1.7.R
haven_table("update6_1")